This study allows us to revisit/renew
- Regression modeling
- Properties of Least Squares/Fitting “a line”
- Multiple observation
Datasets for this study are
- The main file: gauge.txt
- Supplementary large-scale files: download the following folder Full Resolution Data.zip More information about the supplementary file can be found at http://iabp.apl.washington.edu/data.html as well as http://nsidc.org/data/G00791
Question
The aim of this lab is to provide a simple procedure for converting gain into density when the gauge is in operation. Keep in mind that the experiment was conducted by varying density and measuring the response in gain, but when the gauge is ultimately in use, the snow-pack density is to be estimated from the measured gain.
Setup
df <- read.table('gauge-1wb1wa6-2gpel41.txt', header=TRUE)
df <- df[order(df$density), ] # Sort from least to greatest density
m <- 9 # Number of distinct block densities
t <- 10 # Number of replicate measurements
#install.packages('L1pack')
#install.packages("quantreg")
library(L1pack) # Used for least absolute deviations regression line
library(quantreg) # Used for quantile regression line
Loading required package: SparseM
Attaching package: ‘SparseM’
The following object is masked from ‘package:base’:
backsolve
Scenario 1: Fitting
Use the data to fit the gain, or a transformation of gain, to density. Try sketching the least squares line on a scatter plot.
- Do the residuals indicate any problems with the fit?
- If the densities of the polyethylene blocks are not reported exactly, how might this affect the fit?
- What if the blocks of polyethylene were not measured in random order (location)?
# Plot raw data
title <- 'Density vs. Gain'
x.axis <- expression('Density (g/cm'^3*')')
y.axis <- 'Gain'
x.range <- c(0, .7)
y.range <- c(2.5, 6)
plot(df, main=title, xlab=x.axis, ylab=y.axis, xlim=x.range)

# Take log transformation of response variable (gain)
y.log.axis = 'log(Gain)'
df.log = data.frame(df['density'], log(df['gain']))
plot(df.log, main=title, xlab=x.axis, ylab=y.log.axis, xlim=x.range, ylim=y.range)

# Average replicate measurements
df.log.avg = aggregate(list(gain=df.log$gain), by=list(density=df.log$density), FUN=mean)
plot(df.log.avg, main=title, xlab=x.axis, ylab=y.log.axis, xlim=x.range, ylim=y.range)

# Fit gain to density
least.squares <- lm(gain~density, data=df.log.avg)
lad <- lad(gain~density, data=df.log.avg)
quant <- rq(gain~density, tau=.5, data=df.log.avg)
plot(df.log, main=title, xlab=x.axis, ylab=y.log.axis, xlim=x.range, ylim=y.range)
abline(least.squares, col='red')
abline(lad, col='blue')
abline(quant, col='green')
legend('topright', legend=c('Least Squares Regression Line', 'Least Absolute Deviations Regression Line', '50% Quantile Regression Line'), col=c('red', 'blue', 'green'), lty=1)

c(cor(df.log.avg), summary(least.squares)$r.squared)
<<<<<<< HEAD
[1] 1.0000000 -0.9984469 -0.9984469 1.0000000 0.9968963
least.squares
=======
[1] 1.0000000 -0.9984469 -0.9984469 1.0000000 0.9968963
least.squares
>>>>>>> d4f5f9c0a005e74c03c52f2335922402dbf39c0f
Call:
lm(formula = gain ~ density, data = df.log.avg)
Coefficients:
(Intercept) density
5.997 -4.606
lad
Call:
lad(formula = gain ~ density, data = df.log.avg)
Converged in 4 iterations
Coefficients:
(Intercept) density
5.9850 -4.5935
Degrees of freedom: 9 total; 7 residual
Scale estimate: 0.06926379
quant
Call:
rq(formula = gain ~ density, tau = 0.5, data = df.log.avg)
Coefficients:
(Intercept) density
5.985029 -4.593460
Degrees of freedom: 9 total; 7 residual
# Check conditions for linear regression: linearity, normality of residuals, and constant variability
least.squares.residuals <- data.frame(df.log['density'], df.log['gain'] - rep(predict(least.squares), each=10))
lad.residuals <- data.frame(df.log['density'], df.log['gain'] - rep(predict(lad), each=10))
quant.residuals <- data.frame(df.log['density'], df.log['gain'] - rep(predict(quant), each=10))
title.residuals1 <- 'Residuals of Least Squares Regression Line'
title.residuals2 <- 'Residuals of Least Absolute Deviations Regression Line'
title.residuals3 <- 'Residuals of 50% Quantile Regression Line'
plot(least.squares.residuals$gain, main=title.residuals1, ylab=y.axis)
abline(0, 0, col='red')

plot(lad.residuals$gain, main=title.residuals2, ylab=y.axis)
abline(0, 0, col='blue')

plot(quant.residuals$gain, main=title.residuals3, ylab=y.axis)
abline(0, 0, col='green')

num.bins <- 12
hist(least.squares.residuals$gain, breaks=num.bins, main=title.residuals1, xlab=y.axis, col='red')

hist(lad.residuals$gain, breaks=num.bins, main=title.residuals2, xlab=y.axis, col='blue')

hist(quant.residuals$gain, breaks=num.bins, main=title.residuals3, xlab=y.axis, col='green')

qqnorm(least.squares.residuals$gain, main=paste('Normal Q-Q Plot with', title.residuals1), cex.main=1)
qqline(least.squares.residuals$gain, col='red')

qqnorm(lad.residuals$gain, main=paste('Normal Q-Q Plot with', title.residuals2), cex.main=1)
qqline(lad.residuals$gain, col='blue')

qqnorm(quant.residuals$gain, main=paste('Normal Q-Q Plot with', title.residuals3), cex.main=1)
qqline(quant.residuals$gain, col='green')

Scenario 2: Predicting
Ultimately we are interested in answering questions such as: Given a gain reading of 38.6, what is the density of the snow-pack? Given a gain reading of 426.7, what is the density of the snow-pack? These two numeric values, 38.6 and 426.7, were chosen because they are the average gains for the 0.508 and 0.001 densities, respectively.
- Develop a procedure for adding bands around your least squares line that can be used to make interval estimates for the snow-pack density from gain measurements. Keep in mind how the data were collected: several measurements of gain were taken for polyenythylene blocks of known density.
# Predictions
PredictLogGain <- function(density)
predict(least.squares, data.frame(density=density)) # Predict log(gain) using density
PredictDensityLeastSquares <- function(gain) {
intercept <- coef(least.squares)[[1]]
slope <- coef(least.squares)[[2]]
(log(gain) - intercept) / slope # Predict density using gain
}
PredictDensityLad <- function(gain) {
intercept <- coef(lad)[[1]]
slope <- coef(lad)[[2]]
(log(gain) - intercept) / slope # Predict density using gain
}
PredictDensityQuant <- function(gain) {
intercept <- coef(quant)[[1]]
slope <- coef(quant)[[2]]
(log(gain) - intercept) / slope # Predict density using gain
}
# 95% prediction and confidence intervals of log(gain) using density
t <- qt(.975, df=m-2)
mean.density <- mean(df.log.avg$density)
summation <- sum((df.log.avg$density - mean.density) ^ 2)
s2 <- aggregate(list(variance=least.squares.residuals$gain), by=list(density=least.squares.residuals$density), FUN=var)
s.pooled <- sqrt(mean(s2$variance))
center.expr <- quote(center <- PredictLogGain(density))
ci.width.expr <- quote(width <- t * s.pooled * sqrt(1/m + (density-mean.density)^2 / summation))
pi.width.expr <- quote(width <- t * s.pooled * sqrt(1 + 1/m + (density-mean.density)^2 / summation))
LogGainCiLower <- function(density) {
eval(center.expr)
eval(ci.width.expr)
center - width
}
LogGainCiUpper <- function(density) {
eval(center.expr)
eval(ci.width.expr)
center + width
}
LogGainPiLower <- function(density) {
eval(center.expr)
eval(pi.width.expr)
center - width
}
LogGainPiUpper <- function(density) {
eval(center.expr)
eval(pi.width.expr)
center + width
}
# Add bands around least squares line
plot(df.log, main=title, xlab=x.axis, ylab=y.log.axis, xlim=x.range, ylim=y.range)
abline(least.squares, col='red')
ci.col <- 'purple'
pi.col <- 'blue'
symbol <- '-'
size <- 1.5
line.type <- 3
line.width <- 0.7
confidence.intervals <- data.frame(density=df.log.avg$density, lower=LogGainCiLower(df.log.avg$density), upper=LogGainCiUpper(df.log.avg$density))
points(x=confidence.intervals$density, y=confidence.intervals$lower, col=ci.col, pch=symbol, cex=size)
points(x=confidence.intervals$density, y=confidence.intervals$upper, col=ci.col, pch=symbol, cex=size)
lines(x=confidence.intervals$density, y=confidence.intervals$lower, col=ci.col, lty=line.type, lwd=line.width)
lines(x=confidence.intervals$density, y=confidence.intervals$upper, col=ci.col, lty=line.type, lwd=line.width)
prediction.intervals <- data.frame(density=df.log.avg$density, lower=LogGainPiLower(df.log.avg$density), upper=LogGainPiUpper(df.log.avg$density))
points(x=prediction.intervals$density, y=prediction.intervals$lower, col=pi.col, pch=symbol, cex=size)
points(x=prediction.intervals$density, y=prediction.intervals$upper, col=pi.col, pch=symbol, cex=size)
lines(x=prediction.intervals$density, y=prediction.intervals$lower, col=pi.col, lty=line.type, lwd=line.width)
lines(x=prediction.intervals$density, y=prediction.intervals$upper, col=pi.col, lty=line.type, lwd=line.width)
legend('topright', legend=c('Least Squares Regression Line', '95% Confidence Interval Bands', '95% Prediction Interval Bands'), col=c('red', ci.col, pi.col), lty=1)

# 95% prediction and confidence intervals of density using gain
end.points <- c(-1, 3) # Interval to search the root in
DensityCi <- function(gain) {
lower <- uniroot(function(density) log(gain) - LogGainCiLower(density), interval=end.points)[[1]]
upper <- uniroot(function(density) log(gain) - LogGainCiUpper(density), interval=end.points)[[1]]
c(lower, upper)
}
DensityPi <- function(gain) {
lower <- uniroot(function(density) log(gain) - LogGainPiLower(density), end.points)[[1]]
upper <- uniroot(function(density) log(gain) - LogGainPiUpper(density), end.points)[[1]]
c(lower, upper)
}
# Point and interval estimates for example gain readings
PredictDensityLeastSquares(38.6) # 38.6 is the average gain for 0.508 density
[1] 0.5089113
PredictDensityLad(38.6)
[1] 0.5076298
PredictDensityQuant(38.6)
[1] 0.5076298
DensityCi(38.6)
[1] 0.5011879 0.5169000
DensityPi(38.6)
[1] 0.4889568 0.5291323
PredictDensityLeastSquares(426.7) # 426.7 is the average gain for 0.001 density
[1] -0.01276954
PredictDensityLad(426.7)
[1] -0.01546807
PredictDensityQuant(426.7)
[1] -0.01546811
DensityCi(426.7)
[1] -0.024285866 -0.001769193
DensityPi(426.7)
[1] -0.034644666 0.008618629
Scenario 3: Cross-Validation
To check how well your procedure works, omit the set of measurements corresponding to the block of density 0.508, apply your “estimation”/calibration procedure to the remaining data, and provide an interval estimate for the density of a block with an average reading of 38.6. Where does the actual density fall in the interval? Try the same test, for the set of measurements at the 0.001 density.
for (omitted in c(0.508, 0.001)) {
# Omit measurements corresponding to the specified density
df.log.omitted = df.log[which(df.log['density'] != omitted), ]
df.log.avg.omitted <- df.log.avg[which(df.log.avg['density'] != omitted), ]
# Redo calculations using modified dataset
least.squares <- lm(gain~density, data=df.log.avg.omitted)
mean.density <- mean(df.log.avg.omitted$density)
summation <- sum((df.log.avg.omitted$density - mean.density) ^ 2)
s2 <- aggregate(list(variance=least.squares.residuals$gain), by=list(density=least.squares.residuals$density), FUN=var)
s.pooled <- sqrt(mean(s2$variance))
ci.width.expr <- quote(width <- t * s.pooled * sqrt(1/(m-1) + (density-mean.density)^2 / summation))
pi.width.expr <- quote(width <- t * s.pooled * sqrt(1 + 1/(m-1) + (density-mean.density)^2 / summation))
plot(df.log.omitted, main=title, xlab=x.axis, ylab=y.log.axis, xlim=x.range, ylim=y.range)
abline(least.squares, col='red')
ci.col <- 'purple'
pi.col <- 'blue'
symbol <- '-'
size <- 1.5
line.type <- 3
line.width <- 0.7
confidence.intervals <- data.frame(density=df.log.avg.omitted$density, lower=LogGainCiLower(df.log.avg.omitted$density), upper=LogGainCiUpper(df.log.avg.omitted$density))
points(x=confidence.intervals$density, y=confidence.intervals$lower, col=ci.col, pch=symbol, cex=size)
points(x=confidence.intervals$density, y=confidence.intervals$upper, col=ci.col, pch=symbol, cex=size)
lines(x=confidence.intervals$density, y=confidence.intervals$lower, col=ci.col, lty=line.type, lwd=line.width)
lines(x=confidence.intervals$density, y=confidence.intervals$upper, col=ci.col, lty=line.type, lwd=line.width)
prediction.intervals <- data.frame(density=df.log.avg.omitted$density, lower=LogGainPiLower(df.log.avg.omitted$density), upper=LogGainPiUpper(df.log.avg.omitted$density))
points(x=prediction.intervals$density, y=prediction.intervals$lower, col=pi.col, pch=symbol, cex=size)
points(x=prediction.intervals$density, y=prediction.intervals$upper, col=pi.col, pch=symbol, cex=size)
lines(x=prediction.intervals$density, y=prediction.intervals$lower, col=pi.col, lty=line.type, lwd=line.width)
lines(x=prediction.intervals$density, y=prediction.intervals$upper, col=pi.col, lty=line.type, lwd=line.width)
legend('topright', legend=c('Least Squares Regression Line', '95% Confidence Interval Bands', '95% Prediction Interval Bands'), col=c('red', ci.col, pi.col), lty=1)
print(PredictDensityLeastSquares(38.6)) # 38.6 is the average gain for 0.508 density
print(DensityCi(38.6))
print(DensityPi(38.6))
print(PredictDensityLeastSquares(426.7)) # 426.7 is the average gain for 0.001 density
print(DensityCi(426.7))
print(DensityPi(426.7))
}
[1] 0.5091927
[1] 0.5006695 0.5180406
[1] 0.4889184 0.5297925
[1] -0.0128045
[1] -0.024342164 -0.001790802
[1] -0.034760736 0.008598777

[1] 0.5092919
[1] 0.5014370 0.5174323
[1] 0.4890257 0.5298473
[1] -0.02051733
[1] -0.035344991 -0.006521825
[1] -0.044604850 0.002738127

Additional Scenario: Temperature, DOY, and Latitude.
Use the additional dataset to construct a model fitting temperature with DOY, latitude, and other reasonable features. Try sketching the least squares line on a scatter plot. We aim to investigate the relationship between temperature and the DOY, and its latitude.
# Check the correlation
data <- read.csv('Full Resolution Data/64506420.csv', header=TRUE)
data <- data[,c('Hour','DOY','POS_DOY','Lat','Lon','Ts','BP')]
# Drop the extreme outlier case
#data <- data[which(data$Ts>-200),]
data_matrix <- as.matrix(data)
# Correlation Matrix
corr_matrix <- cor(data_matrix)
corr_matrix
Hour DOY POS_DOY Lat Lon Ts BP
Hour 1.000000000 -0.006779489 -0.006775002 -0.007511024 0.01217860 0.008592576 0.02505819
DOY -0.006779489 1.000000000 0.999999958 0.903704617 -0.68012490 -0.906918658 -0.17232397
POS_DOY -0.006775002 0.999999958 1.000000000 0.903696909 -0.68013445 -0.906916964 -0.17230481
Lat -0.007511024 0.903704617 0.903696909 1.000000000 -0.59748962 -0.958835395 -0.29666821
Lon 0.012178596 -0.680124899 -0.680134450 -0.597489620 1.00000000 0.564136723 -0.02981279
Ts 0.008592576 -0.906918658 -0.906916964 -0.958835395 0.56413672 1.000000000 0.20223136
BP 0.025058188 -0.172323969 -0.172304805 -0.296668215 -0.02981279 0.202231363 1.00000000
# Group by DOY and average replicated measurements
data$DOY <- round(data$DOY,0)
data.avg = aggregate(list(data=data[,c('Ts','Lat')]), by=list(DOY=data$DOY), FUN=mean)
# least squares line
ggplot(data.avg,aes(x=data.avg$DOY, y=data.avg$data.Ts)) +
geom_point(color='#2980B9', size = 4) +
<<<<<<< HEAD
geom_smooth(method=lm, color='#2C3E50') +ggtitle(label ="Least Squares Regression Line") + xlab("Day Of Year") +
ylab("Temperature")

fit1<-lm(formula = data.Ts ~ DOY, data = data.avg)
summary(fit1)
Call:
lm(formula = data.Ts ~ DOY, data = data.avg)
Residuals:
Min 1Q Median 3Q Max
-0.95439 -0.34633 0.03139 0.35247 0.84693
=======
<<<<<<< HEAD
geom_smooth(method=lm, color='#2C3E50')

ggplot(data,aes(x=Lat, y=Ts)) +
geom_point(color='#2980B9', size = 4) +
geom_smooth(method=lm, color='#2C3E50')

title.residuals1 <- 'Residuals of Least Squares Regression Line'
plot(fit$residuals, main=title.residuals1)
abline(0, 0, col='red')

num.bins <- 10
hist(fit$residuals, breaks=num.bins, main=title.residuals1, xlab='Temperature', col='red')

qqnorm(fit$residuals, main=paste('Normal Q-Q Plot with', title.residuals1), cex.main=1)
qqline(fit$residuals, col='red')

=======
geom_smooth(method=lm, color='#2C3E50')
>>>>>>> d4f5f9c0a005e74c03c52f2335922402dbf39c0f
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.353218 0.114242 81.87 <2e-16 ***
DOY -0.040253 0.001989 -20.23 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4741 on 86 degrees of freedom
Multiple R-squared: 0.8264, Adjusted R-squared: 0.8244
F-statistic: 409.4 on 1 and 86 DF, p-value: < 2.2e-16
ggplot(data.avg,aes(x=data.avg$data.Lat, y=data.avg$data.Ts)) +
geom_point(color='#2980B9', size = 4) +
geom_smooth(method=lm, color='#2C3E50') +ggtitle(label ="Least Squares Regression Line")+ xlab("Lattitude") +
ylab("Temperature")

fit2<-lm(formula = data.Ts ~ data.Lat, data = data.avg)
summary(fit2)
Call:
lm(formula = data.Ts ~ data.Lat, data = data.avg)
Residuals:
Min 1Q Median 3Q Max
-0.51377 -0.26924 -0.04201 0.24403 0.64902
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 47.9901 1.2672 37.87 <2e-16 ***
data.Lat -0.6202 0.0193 -32.14 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3155 on 86 degrees of freedom
Multiple R-squared: 0.9231, Adjusted R-squared: 0.9222
F-statistic: 1033 on 1 and 86 DF, p-value: < 2.2e-16
# Polynomial Regression Line
fit3<-lm(formula = data.Ts ~ DOY + data.Lat, data = data.avg)
summary(fit3)
<<<<<<< HEAD
Call:
lm(formula = data.Ts ~ DOY + data.Lat, data = data.avg)
Residuals:
Min 1Q Median 3Q Max
-0.60721 -0.22496 -0.00537 0.25822 0.61356
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 40.049638 2.673979 14.978 < 2e-16 ***
DOY -0.009756 0.002936 -3.322 0.00132 **
data.Lat -0.491566 0.042805 -11.484 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2985 on 85 degrees of freedom
Multiple R-squared: 0.932, Adjusted R-squared: 0.9304
F-statistic: 582.1 on 2 and 85 DF, p-value: < 2.2e-16
qqnorm(fit2$residuals, main=paste('Normal Q-Q Plot with', title.residuals1), cex.main=1)
qqline(fit2$residuals, col='red')
=======
>>>>>>> 8826eee1488bdd27557439752deda8eb4e450389
# Attempt for Multiple Regressions
data$Midnights <- 0
data$Midnights[which(data$Hour<6)] <- 1
data$Mornings <- 0
data$Mornings[which(data$Hour>=6 & data$Hour<11)] <- 1
data$Noon <- 0
data$Noon[which(data$Hour>=11 & data$Hour<15)] <- 1
data$Afternoon <- 0
data$Afternoon[which(data$Hour>=15 & data$Hour<20)] <- 1
data$Nights <- 0
data$Nights[which(data$Hour>=20)] <- 1
>>>>>>> d4f5f9c0a005e74c03c52f2335922402dbf39c0f

title.residuals1 <- 'Residuals of Least Square Regression Line'
plot(fit2$residuals, main=title.residuals1, ylab = "Standardized Residuals")
abline(0, 0, col='red')

<<<<<<< HEAD
---
title: 'CASE STUDY 4:'
output: html_notebook
---

This study allows us to revisit/renew

1. Regression modeling
2. Properties of Least Squares/Fitting "a line"
3. Multiple observation

Datasets for this study are

1. The main file: gauge.txt
2. Supplementary large-scale files: download the following folder Full Resolution Data.zip More information about the supplementary file can be found at http://iabp.apl.washington.edu/data.html as well as http://nsidc.org/data/G00791


## Question
The aim of this lab is to provide a simple procedure for converting gain into density when the gauge is in operation. Keep in mind that the experiment was conducted by varying density and measuring the response in gain, but when the gauge is ultimately in use, the snow-pack density is to be estimated from the measured gain.


## Setup
```{r}
df <- read.table('gauge-1wb1wa6-2gpel41.txt', header=TRUE)
df <- df[order(df$density), ]  # Sort from least to greatest density

m <- 9  # Number of distinct block densities
t <- 10  # Number of replicate measurements

#install.packages('L1pack')
#install.packages("quantreg")
library(L1pack)  # Used for least absolute deviations regression line
library(quantreg)  # Used for quantile regression line
```


## Scenario 1: Fitting
Use the data to fit the gain, or a transformation of gain, to density. Try sketching the least squares line on a scatter plot.

* Do the residuals indicate any problems with the fit?
* If the densities of the polyethylene blocks are not reported exactly, how might this affect the fit?
* What if the blocks of polyethylene were not measured in random order (location)?
```{r}
# Plot raw data
title <- 'Density vs. Gain'
x.axis <- expression('Density (g/cm'^3*')')
y.axis <- 'Gain'
x.range <- c(0, .7)
y.range <- c(2.5, 6)
plot(df, main=title, xlab=x.axis, ylab=y.axis, xlim=x.range)


# Take log transformation of response variable (gain)
y.log.axis = 'log(Gain)'
df.log = data.frame(df['density'], log(df['gain']))
plot(df.log, main=title, xlab=x.axis, ylab=y.log.axis, xlim=x.range, ylim=y.range)


# Average replicate measurements
df.log.avg = aggregate(list(gain=df.log$gain), by=list(density=df.log$density), FUN=mean)
plot(df.log.avg, main=title, xlab=x.axis, ylab=y.log.axis, xlim=x.range, ylim=y.range)


# Fit gain to density
least.squares <- lm(gain~density, data=df.log.avg)
lad <- lad(gain~density, data=df.log.avg)
quant <- rq(gain~density, tau=.5, data=df.log.avg)

plot(df.log, main=title, xlab=x.axis, ylab=y.log.axis, xlim=x.range, ylim=y.range)
abline(least.squares, col='red')
abline(lad, col='blue')
abline(quant, col='green')
legend('topright', legend=c('Least Squares Regression Line', 'Least Absolute Deviations Regression Line', '50% Quantile Regression Line'), col=c('red', 'blue', 'green'), lty=1)

c(cor(df.log.avg), summary(least.squares)$r.squared)
least.squares
lad
quant


# Check conditions for linear regression: linearity, normality of residuals, and constant variability
least.squares.residuals <- data.frame(df.log['density'], df.log['gain'] - rep(predict(least.squares), each=10))
lad.residuals <- data.frame(df.log['density'], df.log['gain'] - rep(predict(lad), each=10))
quant.residuals <- data.frame(df.log['density'], df.log['gain'] - rep(predict(quant), each=10))

title.residuals1 <- 'Residuals of Least Squares Regression Line'
title.residuals2 <- 'Residuals of Least Absolute Deviations Regression Line'
title.residuals3 <- 'Residuals of 50% Quantile Regression Line'
plot(least.squares.residuals$gain, main=title.residuals1, ylab=y.axis)
abline(0, 0, col='red')
plot(lad.residuals$gain, main=title.residuals2, ylab=y.axis)
abline(0, 0, col='blue')
plot(quant.residuals$gain, main=title.residuals3, ylab=y.axis)
abline(0, 0, col='green')

num.bins <- 12
hist(least.squares.residuals$gain, breaks=num.bins, main=title.residuals1, xlab=y.axis, col='red')
hist(lad.residuals$gain, breaks=num.bins, main=title.residuals2, xlab=y.axis, col='blue')
hist(quant.residuals$gain, breaks=num.bins, main=title.residuals3, xlab=y.axis, col='green')

qqnorm(least.squares.residuals$gain, main=paste('Normal Q-Q Plot with', title.residuals1), cex.main=1)
qqline(least.squares.residuals$gain, col='red')
qqnorm(lad.residuals$gain, main=paste('Normal Q-Q Plot with', title.residuals2), cex.main=1)
qqline(lad.residuals$gain, col='blue')
qqnorm(quant.residuals$gain, main=paste('Normal Q-Q Plot with', title.residuals3), cex.main=1)
qqline(quant.residuals$gain, col='green')
```


## Scenario 2: Predicting
Ultimately we are interested in answering questions such as: Given a gain reading of 38.6, what is the density of the snow-pack? Given a gain reading of 426.7, what is the density of the snow-pack? These two numeric values, 38.6 and 426.7, were chosen because they are the average gains for the 0.508 and 0.001 densities, respectively.

* Develop a procedure for adding bands around your least squares line that can be used to make interval estimates for the snow-pack density from gain measurements. Keep in mind how the data were collected: several measurements of gain were taken for polyenythylene blocks of known density.
```{r fig.asp=2, fig.width=5}
# Predictions
PredictLogGain <- function(density)
  predict(least.squares, data.frame(density=density))  # Predict log(gain) using density

PredictDensityLeastSquares <- function(gain) {
  intercept <- coef(least.squares)[[1]]
  slope <- coef(least.squares)[[2]]
  (log(gain) - intercept) / slope  # Predict density using gain
}

PredictDensityLad <- function(gain) {
  intercept <- coef(lad)[[1]]
  slope <- coef(lad)[[2]]
  (log(gain) - intercept) / slope  # Predict density using gain
}

PredictDensityQuant <- function(gain) {
  intercept <- coef(quant)[[1]]
  slope <- coef(quant)[[2]]
  (log(gain) - intercept) / slope  # Predict density using gain
}


# 95% prediction and confidence intervals of log(gain) using density
t <- qt(.975, df=m-2)
mean.density <- mean(df.log.avg$density)
summation <- sum((df.log.avg$density - mean.density) ^ 2)

s2 <- aggregate(list(variance=least.squares.residuals$gain), by=list(density=least.squares.residuals$density), FUN=var)
s.pooled <- sqrt(mean(s2$variance))

center.expr <- quote(center <- PredictLogGain(density))
ci.width.expr <- quote(width <- t * s.pooled * sqrt(1/m + (density-mean.density)^2 / summation))
pi.width.expr <- quote(width <- t * s.pooled * sqrt(1 + 1/m + (density-mean.density)^2 / summation))

LogGainCiLower <- function(density) {
  eval(center.expr)
  eval(ci.width.expr)
  center - width
}

LogGainCiUpper <- function(density) {
  eval(center.expr)
  eval(ci.width.expr)
  center + width
}

LogGainPiLower <- function(density) {
  eval(center.expr)
  eval(pi.width.expr)
  center - width
}

LogGainPiUpper <- function(density) {
  eval(center.expr)
  eval(pi.width.expr)
  center + width
}


# Add bands around least squares line
plot(df.log, main=title, xlab=x.axis, ylab=y.log.axis, xlim=x.range, ylim=y.range)
abline(least.squares, col='red')

ci.col <- 'purple'
pi.col <- 'blue'
symbol <- '-'
size <- 1.5
line.type <- 3
line.width <- 0.7

confidence.intervals <- data.frame(density=df.log.avg$density, lower=LogGainCiLower(df.log.avg$density), upper=LogGainCiUpper(df.log.avg$density))
points(x=confidence.intervals$density, y=confidence.intervals$lower, col=ci.col, pch=symbol, cex=size)
points(x=confidence.intervals$density, y=confidence.intervals$upper, col=ci.col, pch=symbol, cex=size)
lines(x=confidence.intervals$density, y=confidence.intervals$lower, col=ci.col, lty=line.type, lwd=line.width)
lines(x=confidence.intervals$density, y=confidence.intervals$upper, col=ci.col, lty=line.type, lwd=line.width)

prediction.intervals <- data.frame(density=df.log.avg$density, lower=LogGainPiLower(df.log.avg$density), upper=LogGainPiUpper(df.log.avg$density))
points(x=prediction.intervals$density, y=prediction.intervals$lower, col=pi.col, pch=symbol, cex=size)
points(x=prediction.intervals$density, y=prediction.intervals$upper, col=pi.col, pch=symbol, cex=size)
lines(x=prediction.intervals$density, y=prediction.intervals$lower, col=pi.col, lty=line.type, lwd=line.width)
lines(x=prediction.intervals$density, y=prediction.intervals$upper, col=pi.col, lty=line.type, lwd=line.width)

legend('topright', legend=c('Least Squares Regression Line', '95% Confidence Interval Bands', '95% Prediction Interval Bands'), col=c('red', ci.col, pi.col), lty=1)


# 95% prediction and confidence intervals of density using gain
end.points <- c(-1, 3)  # Interval to search the root in

DensityCi <- function(gain) {
  lower <- uniroot(function(density) log(gain) - LogGainCiLower(density), interval=end.points)[[1]]
  upper <- uniroot(function(density) log(gain) - LogGainCiUpper(density), interval=end.points)[[1]]
  c(lower, upper)
}

DensityPi <- function(gain) {
  lower <- uniroot(function(density) log(gain) - LogGainPiLower(density), end.points)[[1]]
  upper <- uniroot(function(density) log(gain) - LogGainPiUpper(density), end.points)[[1]]
  c(lower, upper)
}


# Point and interval estimates for example gain readings
PredictDensityLeastSquares(38.6)  # 38.6 is the average gain for 0.508 density
PredictDensityLad(38.6)
PredictDensityQuant(38.6)
DensityCi(38.6)
DensityPi(38.6)

PredictDensityLeastSquares(426.7)  # 426.7 is the average gain for 0.001 density
PredictDensityLad(426.7)
PredictDensityQuant(38.6)
DensityCi(426.7)
DensityPi(426.7)
```


## Scenario 3: Cross-Validation
To check how well your procedure works, omit the set of measurements corresponding to the block of density 0.508, apply your "estimation"/calibration procedure to the remaining data, and provide an interval estimate for the density of a block with an average reading of 38.6. Where does the actual density fall in the interval? Try the same test, for the set of measurements at the 0.001 density.
```{r fig.asp=2, fig.width=5}
for (omitted in c(0.508, 0.001)) {
  # Omit measurements corresponding to the specified density
  df.log.omitted = df.log[which(df.log['density'] != omitted), ]
  df.log.avg.omitted <- df.log.avg[which(df.log.avg['density'] != omitted), ]
  
  
  # Redo calculations using modified dataset
  least.squares <- lm(gain~density, data=df.log.avg.omitted)
  
  mean.density <- mean(df.log.avg.omitted$density)
  summation <- sum((df.log.avg.omitted$density - mean.density) ^ 2)
  
  s2 <- aggregate(list(variance=least.squares.residuals$gain), by=list(density=least.squares.residuals$density), FUN=var)
  s.pooled <- sqrt(mean(s2$variance))
  
  ci.width.expr <- quote(width <- t * s.pooled * sqrt(1/(m-1) + (density-mean.density)^2 / summation))
  pi.width.expr <- quote(width <- t * s.pooled * sqrt(1 + 1/(m-1) + (density-mean.density)^2 / summation))
  
  plot(df.log.omitted, main=title, xlab=x.axis, ylab=y.log.axis, xlim=x.range, ylim=y.range)
  abline(least.squares, col='red')
  
  ci.col <- 'purple'
  pi.col <- 'blue'
  symbol <- '-'
  size <- 1.5
  line.type <- 3
  line.width <- 0.7
  
  confidence.intervals <- data.frame(density=df.log.avg.omitted$density, lower=LogGainCiLower(df.log.avg.omitted$density), upper=LogGainCiUpper(df.log.avg.omitted$density))
  points(x=confidence.intervals$density, y=confidence.intervals$lower, col=ci.col, pch=symbol, cex=size)
  points(x=confidence.intervals$density, y=confidence.intervals$upper, col=ci.col, pch=symbol, cex=size)
  lines(x=confidence.intervals$density, y=confidence.intervals$lower, col=ci.col, lty=line.type, lwd=line.width)
  lines(x=confidence.intervals$density, y=confidence.intervals$upper, col=ci.col, lty=line.type, lwd=line.width)
  
  prediction.intervals <- data.frame(density=df.log.avg.omitted$density, lower=LogGainPiLower(df.log.avg.omitted$density), upper=LogGainPiUpper(df.log.avg.omitted$density))
  points(x=prediction.intervals$density, y=prediction.intervals$lower, col=pi.col, pch=symbol, cex=size)
  points(x=prediction.intervals$density, y=prediction.intervals$upper, col=pi.col, pch=symbol, cex=size)
  lines(x=prediction.intervals$density, y=prediction.intervals$lower, col=pi.col, lty=line.type, lwd=line.width)
  lines(x=prediction.intervals$density, y=prediction.intervals$upper, col=pi.col, lty=line.type, lwd=line.width)
  
  legend('topright', legend=c('Least Squares Regression Line', '95% Confidence Interval Bands', '95% Prediction Interval Bands'), col=c('red', ci.col, pi.col), lty=1)
  
  print(PredictDensityLeastSquares(38.6))  # 38.6 is the average gain for 0.508 density
  print(DensityCi(38.6))
  print(DensityPi(38.6))
  
  print(PredictDensityLeastSquares(426.7))  # 426.7 is the average gain for 0.001 density
  print(DensityCi(426.7))
  print(DensityPi(426.7))
}
```


## Additional Scenario: Temperature, DOY, and Latitude.
Use the additional dataset to construct a model fitting temperature with DOY, latitude, and other reasonable features. Try sketching the least squares line on a scatter plot. We aim to investigate the relationship between temperature and the DOY, and its latitude.
```{r}
# Check the correlation
data <- read.csv('Full Resolution Data/64506420.csv', header=TRUE)
data <- data[,c('Hour','DOY','POS_DOY','Lat','Lon','Ts','BP')]

# Drop the extreme outlier case
#data <- data[which(data$Ts>-200),]
data_matrix <- as.matrix(data)

# Correlation Matrix
corr_matrix <- cor(data_matrix)
corr_matrix
```

```{r}
# Group by DOY and average replicated measurements
data$DOY <- round(data$DOY,0)
data.avg = aggregate(list(data=data[,c('Ts','Lat')]), by=list(DOY=data$DOY), FUN=mean)

# least squares line
ggplot(data.avg,aes(x=data.avg$DOY, y=data.avg$data.Ts)) + 
  geom_point(color='#2980B9', size = 4) + 
  geom_smooth(method=lm, color='#2C3E50') +ggtitle(label ="Least Squares Regression Line") + xlab("Day Of Year") +
  ylab("Temperature")
fit1<-lm(formula = data.Ts ~ DOY, data = data.avg)
summary(fit1)

ggplot(data.avg,aes(x=data.avg$data.Lat, y=data.avg$data.Ts)) + 
  geom_point(color='#2980B9', size = 4) + 
  geom_smooth(method=lm, color='#2C3E50') +ggtitle(label ="Least Squares Regression Line")+ xlab("Lattitude") +
  ylab("Temperature")
fit2<-lm(formula = data.Ts ~ data.Lat, data = data.avg)
summary(fit2)

# Polynomial Regression Line
fit3<-lm(formula = data.Ts ~ DOY + data.Lat, data = data.avg)
summary(fit3)

qqnorm(fit2$residuals, main=paste('Normal Q-Q Plot with', title.residuals1), cex.main=1)
qqline(fit2$residuals, col='red')

title.residuals1 <- 'Residuals of Least Square Regression Line'
plot(fit2$residuals, main=title.residuals1, ylab = "Standardized Residuals")
abline(0, 0, col='red')

```

=======
<<<<<<< HEAD
---
title: 'CASE STUDY 4:'
output: html_notebook
---

This study allows us to revisit/renew

1. Regression modeling
2. Properties of Least Squares/Fitting "a line"
3. Multiple observation

Datasets for this study are

1. The main file: gauge.txt
2. Supplementary large-scale files: download the following folder Full Resolution Data.zip More information about the supplementary file can be found at http://iabp.apl.washington.edu/data.html as well as http://nsidc.org/data/G00791


## Question
The aim of this lab is to provide a simple procedure for converting gain into density when the gauge is in operation. Keep in mind that the experiment was conducted by varying density and measuring the response in gain, but when the gauge is ultimately in use, the snow-pack density is to be estimated from the measured gain.


## Setup
```{r}
df <- read.table('gauge-1wb1wa6-2gpel41.txt', header=TRUE)
df <- df[order(df$density), ]  # Sort from least to greatest density

m <- 9  # Number of distinct block densities
t <- 10  # Number of replicate measurements

#install.packages('L1pack')
#install.packages("quantreg")
library(L1pack)  # Used for least absolute deviations regression line
library(quantreg)  # Used for quantile regression line
```


## Scenario 1: Fitting
Use the data to fit the gain, or a transformation of gain, to density. Try sketching the least squares line on a scatter plot.

* Do the residuals indicate any problems with the fit?
* If the densities of the polyethylene blocks are not reported exactly, how might this affect the fit?
* What if the blocks of polyethylene were not measured in random order (location)?
```{r}
# Plot raw data
title <- 'Density vs. Gain'
x.axis <- expression('Density (g/cm'^3*')')
y.axis <- 'Gain'
x.range <- c(0, .7)
y.range <- c(2.5, 6)
plot(df, main=title, xlab=x.axis, ylab=y.axis, xlim=x.range)


# Take log transformation of response variable (gain)
y.log.axis = 'log(Gain)'
df.log = data.frame(df['density'], log(df['gain']))
plot(df.log, main=title, xlab=x.axis, ylab=y.log.axis, xlim=x.range, ylim=y.range)


# Average replicate measurements
df.log.avg = aggregate(list(gain=df.log$gain), by=list(density=df.log$density), FUN=mean)
plot(df.log.avg, main=title, xlab=x.axis, ylab=y.log.axis, xlim=x.range, ylim=y.range)


# Fit gain to density
least.squares <- lm(gain~density, data=df.log.avg)
lad <- lad(gain~density, data=df.log.avg)
quant <- rq(gain~density, tau=.5, data=df.log.avg)

plot(df.log, main=title, xlab=x.axis, ylab=y.log.axis, xlim=x.range, ylim=y.range)
abline(least.squares, col='red')
abline(lad, col='blue')
abline(quant, col='green')
legend('topright', legend=c('Least Squares Regression Line', 'Least Absolute Deviations Regression Line', '50% Quantile Regression Line'), col=c('red', 'blue', 'green'), lty=1)

c(cor(df.log.avg), summary(least.squares)$r.squared)
least.squares
lad
quant


# Check conditions for linear regression: linearity, normality of residuals, and constant variability
least.squares.residuals <- data.frame(df.log['density'], df.log['gain'] - rep(predict(least.squares), each=10))
lad.residuals <- data.frame(df.log['density'], df.log['gain'] - rep(predict(lad), each=10))
quant.residuals <- data.frame(df.log['density'], df.log['gain'] - rep(predict(quant), each=10))

title.residuals1 <- 'Residuals of Least Squares Regression Line'
title.residuals2 <- 'Residuals of Least Absolute Deviations Regression Line'
title.residuals3 <- 'Residuals of 50% Quantile Regression Line'
plot(least.squares.residuals$gain, main=title.residuals1, ylab=y.axis)
abline(0, 0, col='red')
plot(lad.residuals$gain, main=title.residuals2, ylab=y.axis)
abline(0, 0, col='blue')
plot(quant.residuals$gain, main=title.residuals3, ylab=y.axis)
abline(0, 0, col='green')

num.bins <- 12
hist(least.squares.residuals$gain, breaks=num.bins, main=title.residuals1, xlab=y.axis, col='red')
hist(lad.residuals$gain, breaks=num.bins, main=title.residuals2, xlab=y.axis, col='blue')
hist(quant.residuals$gain, breaks=num.bins, main=title.residuals3, xlab=y.axis, col='green')

qqnorm(least.squares.residuals$gain, main=paste('Normal Q-Q Plot with', title.residuals1), cex.main=1)
qqline(least.squares.residuals$gain, col='red')
qqnorm(lad.residuals$gain, main=paste('Normal Q-Q Plot with', title.residuals2), cex.main=1)
qqline(lad.residuals$gain, col='blue')
qqnorm(quant.residuals$gain, main=paste('Normal Q-Q Plot with', title.residuals3), cex.main=1)
qqline(quant.residuals$gain, col='green')
```


## Scenario 2: Predicting
Ultimately we are interested in answering questions such as: Given a gain reading of 38.6, what is the density of the snow-pack? Given a gain reading of 426.7, what is the density of the snow-pack? These two numeric values, 38.6 and 426.7, were chosen because they are the average gains for the 0.508 and 0.001 densities, respectively.

* Develop a procedure for adding bands around your least squares line that can be used to make interval estimates for the snow-pack density from gain measurements. Keep in mind how the data were collected: several measurements of gain were taken for polyenythylene blocks of known density.
```{r fig.asp=2, fig.width=5}
# Predictions
PredictLogGain <- function(density)
  predict(least.squares, data.frame(density=density))  # Predict log(gain) using density

PredictDensityLeastSquares <- function(gain) {
  intercept <- coef(least.squares)[[1]]
  slope <- coef(least.squares)[[2]]
  (log(gain) - intercept) / slope  # Predict density using gain
}

PredictDensityLad <- function(gain) {
  intercept <- coef(lad)[[1]]
  slope <- coef(lad)[[2]]
  (log(gain) - intercept) / slope  # Predict density using gain
}

PredictDensityQuant <- function(gain) {
  intercept <- coef(quant)[[1]]
  slope <- coef(quant)[[2]]
  (log(gain) - intercept) / slope  # Predict density using gain
}


# 95% prediction and confidence intervals of log(gain) using density
t <- qt(.975, df=m-2)
mean.density <- mean(df.log.avg$density)
summation <- sum((df.log.avg$density - mean.density) ^ 2)

s2 <- aggregate(list(variance=least.squares.residuals$gain), by=list(density=least.squares.residuals$density), FUN=var)
s.pooled <- sqrt(mean(s2$variance))

center.expr <- quote(center <- PredictLogGain(density))
ci.width.expr <- quote(width <- t * s.pooled * sqrt(1/m + (density-mean.density)^2 / summation))
pi.width.expr <- quote(width <- t * s.pooled * sqrt(1 + 1/m + (density-mean.density)^2 / summation))

LogGainCiLower <- function(density) {
  eval(center.expr)
  eval(ci.width.expr)
  center - width
}

LogGainCiUpper <- function(density) {
  eval(center.expr)
  eval(ci.width.expr)
  center + width
}

LogGainPiLower <- function(density) {
  eval(center.expr)
  eval(pi.width.expr)
  center - width
}

LogGainPiUpper <- function(density) {
  eval(center.expr)
  eval(pi.width.expr)
  center + width
}


# Add bands around least squares line
plot(df.log, main=title, xlab=x.axis, ylab=y.log.axis, xlim=x.range, ylim=y.range)
abline(least.squares, col='red')

ci.col <- 'purple'
pi.col <- 'blue'
symbol <- '-'
size <- 1.5
line.type <- 3
line.width <- 0.7

confidence.intervals <- data.frame(density=df.log.avg$density, lower=LogGainCiLower(df.log.avg$density), upper=LogGainCiUpper(df.log.avg$density))
points(x=confidence.intervals$density, y=confidence.intervals$lower, col=ci.col, pch=symbol, cex=size)
points(x=confidence.intervals$density, y=confidence.intervals$upper, col=ci.col, pch=symbol, cex=size)
lines(x=confidence.intervals$density, y=confidence.intervals$lower, col=ci.col, lty=line.type, lwd=line.width)
lines(x=confidence.intervals$density, y=confidence.intervals$upper, col=ci.col, lty=line.type, lwd=line.width)

prediction.intervals <- data.frame(density=df.log.avg$density, lower=LogGainPiLower(df.log.avg$density), upper=LogGainPiUpper(df.log.avg$density))
points(x=prediction.intervals$density, y=prediction.intervals$lower, col=pi.col, pch=symbol, cex=size)
points(x=prediction.intervals$density, y=prediction.intervals$upper, col=pi.col, pch=symbol, cex=size)
lines(x=prediction.intervals$density, y=prediction.intervals$lower, col=pi.col, lty=line.type, lwd=line.width)
lines(x=prediction.intervals$density, y=prediction.intervals$upper, col=pi.col, lty=line.type, lwd=line.width)

legend('topright', legend=c('Least Squares Regression Line', '95% Confidence Interval Bands', '95% Prediction Interval Bands'), col=c('red', ci.col, pi.col), lty=1)


# 95% prediction and confidence intervals of density using gain
interval <- c(-1, 3)  # Interval to search the root in

DensityCi <- function(gain) {
  lower <- uniroot(function(density) log(gain) - LogGainCiLower(density), interval=interval)[[1]]
  upper <- uniroot(function(density) log(gain) - LogGainCiUpper(density), interval=interval)[[1]]
  c(lower, upper)
}

DensityPi <- function(gain) {
  lower <- uniroot(function(density) log(gain) - LogGainPiLower(density), interval=interval)[[1]]
  upper <- uniroot(function(density) log(gain) - LogGainPiUpper(density), interval=interval)[[1]]
  c(lower, upper)
}


# Point and interval estimates for example gain readings
PredictDensityLeastSquares(38.6)  # 38.6 is the average gain for 0.508 density
PredictDensityLad(38.6)
PredictDensityQuant(38.6)
DensityCi(38.6)
DensityPi(38.6)

PredictDensityLeastSquares(426.7)  # 426.7 is the average gain for 0.001 density
PredictDensityLad(426.7)
PredictDensityQuant(426.7)
DensityCi(426.7)
DensityPi(426.7)
```


## Scenario 3: Cross-Validation
To check how well your procedure works, omit the set of measurements corresponding to the block of density 0.508, apply your "estimation"/calibration procedure to the remaining data, and provide an interval estimate for the density of a block with an average reading of 38.6. Where does the actual density fall in the interval? Try the same test, for the set of measurements at the 0.001 density.
```{r fig.asp=2, fig.width=5}
for (omitted in c(0.508, 0.001)) {
  # Omit measurements corresponding to the specified density
  df.log.omitted = df.log[which(df.log['density'] != omitted), ]
  df.log.avg.omitted <- df.log.avg[which(df.log.avg['density'] != omitted), ]
  
  
  # Redo calculations using modified dataset
  least.squares <- lm(gain~density, data=df.log.avg.omitted)
  
  mean.density <- mean(df.log.avg.omitted$density)
  summation <- sum((df.log.avg.omitted$density - mean.density) ^ 2)
  
  s2 <- aggregate(list(variance=least.squares.residuals$gain), by=list(density=least.squares.residuals$density), FUN=var)
  s.pooled <- sqrt(mean(s2$variance))
  
  ci.width.expr <- quote(width <- t * s.pooled * sqrt(1/(m-1) + (density-mean.density)^2 / summation))
  pi.width.expr <- quote(width <- t * s.pooled * sqrt(1 + 1/(m-1) + (density-mean.density)^2 / summation))
  
  plot(df.log.omitted, main=title, xlab=x.axis, ylab=y.log.axis, xlim=x.range, ylim=y.range)
  abline(least.squares, col='red')
  
  ci.col <- 'purple'
  pi.col <- 'blue'
  symbol <- '-'
  size <- 1.5
  line.type <- 3
  line.width <- 0.7
  
  confidence.intervals <- data.frame(density=df.log.avg.omitted$density, lower=LogGainCiLower(df.log.avg.omitted$density), upper=LogGainCiUpper(df.log.avg.omitted$density))
  points(x=confidence.intervals$density, y=confidence.intervals$lower, col=ci.col, pch=symbol, cex=size)
  points(x=confidence.intervals$density, y=confidence.intervals$upper, col=ci.col, pch=symbol, cex=size)
  lines(x=confidence.intervals$density, y=confidence.intervals$lower, col=ci.col, lty=line.type, lwd=line.width)
  lines(x=confidence.intervals$density, y=confidence.intervals$upper, col=ci.col, lty=line.type, lwd=line.width)
  
  prediction.intervals <- data.frame(density=df.log.avg.omitted$density, lower=LogGainPiLower(df.log.avg.omitted$density), upper=LogGainPiUpper(df.log.avg.omitted$density))
  points(x=prediction.intervals$density, y=prediction.intervals$lower, col=pi.col, pch=symbol, cex=size)
  points(x=prediction.intervals$density, y=prediction.intervals$upper, col=pi.col, pch=symbol, cex=size)
  lines(x=prediction.intervals$density, y=prediction.intervals$lower, col=pi.col, lty=line.type, lwd=line.width)
  lines(x=prediction.intervals$density, y=prediction.intervals$upper, col=pi.col, lty=line.type, lwd=line.width)
  
  legend('topright', legend=c('Least Squares Regression Line', '95% Confidence Interval Bands', '95% Prediction Interval Bands'), col=c('red', ci.col, pi.col), lty=1)
  
  print(PredictDensityLeastSquares(38.6))  # 38.6 is the average gain for 0.508 density
  print(DensityCi(38.6))
  print(DensityPi(38.6))
  
  print(PredictDensityLeastSquares(426.7))  # 426.7 is the average gain for 0.001 density
  print(DensityCi(426.7))
  print(DensityPi(426.7))
}
```


## Additional Scenario: Temperature, DOY, and Latitude.
Use the additional dataset to construct a model fitting temperature with DOY, latitude, and other reasonable features. Try sketching the least squares line on a scatter plot. We aim to investigate the relationship between temperature and the DOY, and its latitude.
```{r}
# Check the correlation
data <- read.csv('Full Resolution Data/64506420.csv', header=TRUE)
data <- data[,c('Hour','DOY','POS_DOY','Lat','Lon','Ts','BP')]

# Drop the extreme outlier case
#data <- data[which(data$Ts>-200),]
data_matrix <- as.matrix(data)

# Correlation Matrix
corr_matrix <- cor(data_matrix)
corr_matrix
```

```{r}
# least squares line
fit<-lm(formula = Ts ~ DOY + Lat, data = data)
summary(fit)

ggplot(data,aes(x=DOY, y=Ts)) + 
  geom_point(color='#2980B9', size = 4) + 
  geom_smooth(method=lm, color='#2C3E50')

ggplot(data,aes(x=Lat, y=Ts)) + 
  geom_point(color='#2980B9', size = 4) + 
  geom_smooth(method=lm, color='#2C3E50')

title.residuals1 <- 'Residuals of Least Squares Regression Line'
plot(fit$residuals, main=title.residuals1)
abline(0, 0, col='red')

num.bins <- 10
hist(fit$residuals, breaks=num.bins, main=title.residuals1, xlab='Temperature', col='red')

qqnorm(fit$residuals, main=paste('Normal Q-Q Plot with', title.residuals1), cex.main=1)
qqline(fit$residuals, col='red')
```

```{r}
# Attempt for Multiple Regressions
data$Midnights <- 0
data$Midnights[which(data$Hour<6)] <- 1
data$Mornings <- 0
data$Mornings[which(data$Hour>=6 & data$Hour<11)] <- 1 
data$Noon <- 0
data$Noon[which(data$Hour>=11 & data$Hour<15)] <- 1
data$Afternoon <- 0
data$Afternoon[which(data$Hour>=15 & data$Hour<20)] <- 1
data$Nights <- 0
data$Nights[which(data$Hour>=20)] <- 1
```
=======
<<<<<<< HEAD
---
title: 'CASE STUDY 4:'
output: html_notebook
---

This study allows us to revisit/renew

1. Regression modeling
2. Properties of Least Squares/Fitting "a line"
3. Multiple observation

Datasets for this study are

1. The main file: gauge.txt
2. Supplementary large-scale files: download the following folder Full Resolution Data.zip More information about the supplementary file can be found at http://iabp.apl.washington.edu/data.html as well as http://nsidc.org/data/G00791


## Question
The aim of this lab is to provide a simple procedure for converting gain into density when the gauge is in operation. Keep in mind that the experiment was conducted by varying density and measuring the response in gain, but when the gauge is ultimately in use, the snow-pack density is to be estimated from the measured gain.


## Setup
```{r}
df <- read.table('gauge-1wb1wa6-2gpel41.txt', header=TRUE)

#install.packages('L1pack')
library(L1pack)  # Used for least absolute deviations regression line
```


## Scenario 1: Fitting
Use the data to fit the gain, or a transformation of gain, to density. Try sketching the least squares line on a scatter plot.

* Do the residuals indicate any problems with the fit?
* If the densities of the polyethylene blocks are not reported exactly, how might this affect the fit?
* What if the blocks of polyethylene were not measured in random order?
```{r}
# Plot raw data
title <- 'Density vs. Gain'
x.axis <- expression('Density (g/cm'^3*')') 
y.axis <- 'Gain'
plot(df, main=title, xlab=x.axis, ylab=y.axis)


# Take log transformation of response variable (gain)
y.log.axis = 'log(Gain)'
df.log = data.frame(df['density'], log(df['gain']))
plot(df.log, main=title, xlab=x.axis, ylab=y.log.axis)


# Average replicate measurements
df.log.avg = aggregate(list(gain=df.log$gain), by=list(density=df.log$density), FUN=mean)
plot(df.log.avg, main=title, xlab=x.axis, ylab=y.log.axis)


# Fit gain to density
cor(df.log.avg)

least.squares <- lm(gain~density, data=df.log.avg)
least.absolute.deviations <- lad(gain~density, data=df.log.avg)

plot(df.log.avg, main=title, xlab=x.axis, ylab=y.log.axis)
abline(least.squares, col='red')
abline(least.absolute.deviations, col='blue')
legend('topright', legend=c('Least Squares', 'Least Absolute Deviations'), col=c('red', 'blue'), lty=1);

least.squares
summary(least.squares)
least.absolute.deviations
summary(least.absolute.deviations)


# Check conditions for linear regression: linearity, normality of residuals, constant variability
title.residuals1 <- 'Residuals of Least Squares Regression Line'
title.residuals2 <- 'Residuals of Least Absolute Deviations Regression Line'
plot(least.squares$residuals, main=title.residuals1, ylab=y.axis)
abline(0, 0, col='red')
plot(least.absolute.deviations$residuals, main=title.residuals2, ylab=y.axis)
abline(0, 0, col='blue')

num.bins <- 10
hist(least.squares$residuals, breaks=num.bins, main=title.residuals1, xlab=y.axis, col='red')
hist(least.absolute.deviations$residuals, breaks=num.bins, main=title.residuals2, xlab=y.axis, col='blue')

qqnorm(least.squares$residuals, main=paste('Normal Q-Q Plot with', title.residuals1), cex.main=1)
qqline(least.squares$residuals, col='red')
qqnorm(least.absolute.deviations$residuals, main=paste('Normal Q-Q Plot with', title.residuals2), cex.main=1)
qqline(least.absolute.deviations$residuals, col='blue')

# If the densities of the polyethylene blocks were not reported exactly, not only would we not know the density of our snow, our explanatory variable, but we could also retrieve inaccurate data for the gain from the photons. It is possible that if the density of the snow was not properly reported, our whole data set would be flawed and we could arrive at a calibration procedure different from what we would have wanted.

# Since the blocks themselves were in random locations, therefore if the measurement were not taken randomly, then it might happen that the measurements might be taken in a only specific area and thus causing bias.

# If the blocks of polyethylene were not measured in random order, there could be a dependence on the measurements of the snow densities, and our data would not be i.i.d. If we handpicked which blocks to measure or picked them in a predetermined order, the density of the snow blocks could influence how the gain was recorded for each block of snow and also produce inaccurate results.
```


## Scenario 2: Predicting
Ultimately we are interested in answering questions such as: Given a gain reading of 38.6, what is the density of the snow-pack? Given a gain reading of 426.7, what is the density of the snow-pack? These two numeric values, 38.6 and 426.7, were chosen because they are the average gains for the 0.508 and 0.001 densities, respectively.

* Develop a procedure for adding bands around your least squares line that can be used to make interval estimates for the snow-pack density from gain measurements. Keep in mind how the data were collected: several measurements of gain were taken for polyenythylene blocks of known density.

```{r}
pred.int =  predict(least.squares, interval="prediction", level=.95)
plot(pred.int)
```

```{r}
library(ggplot2)
# Least Square b0 and b1 value
b1 <-  as.numeric(coefficients((least.squares))[2])
b0 <-  as.numeric(coefficients((least.squares))[1])

# Function to use gain to predict density 
ls.regression <- function(gain){
  log.gain <-  log(gain)
  estimated.density <-  (log.gain - b0) / b1;
  return(estimated.density);
}

# Estimates for density
density.1 <- ls.regression(38.6)
density.2 <- ls.regression(426.7)

# Graph for Confidence Interval Bands for two estimates
ggplot(df.log.avg,aes(x=density, y=gain)) + 
  geom_point(color='#2980B9', size = 4) + 
  geom_smooth(method=lm, color='#2C3E50')+geom_point() +
  geom_point(aes(x=0.508, y=3.65745), colour="yellow") 

ggplot(df.log.avg,aes(x=density, y=gain)) + 
  geom_point(color='#2980B9', size = 4) + 
  geom_smooth(method=lm, color='#2C3E50')+geom_point() +
  geom_point(aes(x=0.508, y=3.65745), colour="yellow") +
  geom_point()+geom_point(aes(x=0.001, y=5.99266), colour="red")

# Parameters
delta_h <-sd(least.squares$residuals)
z <- qnorm(0.975)
m <- 9
x_bar <- mean(df$density)
den <- sum((df$density-x_bar)**2)

# Prediction Interval 
lower.pi.bound <- function(gain){
  return((-sqrt(den)*sqrt(9*x_bar*x_bar*b1*b1/z/z/delta_h/delta_h+18*x_bar*b1/z/delta_h*(b0-log(gain))/z/delta_h+10*den*b1*b1/z/delta_h/z/delta_h+9*(b0-log(gain))*(b0-log(gain))/z/z/delta_h/delta_h-10)-3*x_bar-3*den*b1*(b0-log(gain))/z/z/delta_h/delta_h)/(3*(den*(b1/z/delta_h)^2)-1))
}
upper.pi.bound <- function(gain){
  return((sqrt(den)*sqrt(9*x_bar*x_bar*b1*b1/z/z/delta_h/delta_h+18*x_bar*b1/z/delta_h*(b0-log(gain))/z/delta_h+10*den*b1*b1/z/delta_h/z/delta_h+9*(b0-log(gain))*(b0-log(gain))/z/z/delta_h/delta_h-10)-3*x_bar-3*den*b1*(b0-log(gain))/z/z/delta_h/delta_h)/(3*(den*(b1/z/delta_h)^2)-1))
}
print("The Prediction Interval for the predicted density given gain = 38.6 is")
lower.pi.bound(38.6)
upper.pi.bound(38.6)
print("The Prediction Interval for the predicted density given gain = 426.7 is")
lower.pi.bound(426.7)
upper.pi.bound(426.7)

# Parameters
t <- qt(0.975, m-2)
# Confidence Interval
lower.ci.bound <- function(gain){
  return((-sqrt(den)*sqrt(9*x_bar*x_bar*b1*b1/t/t/delta_h/delta_h+18*x_bar*b1/t/delta_h*(b0-log(gain))/t/delta_h+9*(b0-log(gain))*(b0-log(gain))/t/t/delta_h/delta_h-1)-3*x_bar-3*den*b1*(b0-log(gain))/t/t/delta_h/delta_h)/(3*(den*(b1/t/delta_h)^2)-1))
}
upper.ci.bound <- function(gain){
  return((sqrt(den)*sqrt(9*x_bar*x_bar*b1*b1/t/t/delta_h/delta_h+18*x_bar*b1/t/delta_h*(b0-log(gain))/t/delta_h+9*(b0-log(gain))*(b0-log(gain))/t/t/delta_h/delta_h-1)-3*x_bar-3*den*b1*(b0-log(gain))/t/t/delta_h/delta_h)/(3*(den*(b1/t/delta_h)^2)-1))
}
print("The Confidence Interval for the predicted density given gain = 38.6 is")
lower.ci.bound(38.6)
upper.ci.bound(38.6)
print("The Confidence Interval for the predicted density given gain = 426.7 is")
lower.ci.bound(426.7)
upper.ci.bound(426.7)

```

```{r}
# Construct Confidence Interval for density = 0.508
new.data <- data.frame(density = 0.508)
log_prediction = predict(least.squares, new.data, interval="confidence")
log_prediction
prediction <- exp(log_prediction)
prediction
# Interpretation: From the output, the fitted gain at a snow density of 0.508 (g/cm'^3*') is around 38.76. The confidence interval of (36.296, 41.3963) signifies the range in which the true population parameter lies at a 95% level of confidence.

# Construct Prediction Interval for density = 0.508
log_prediction = predict(least.squares, new.data, interval="prediction")
log_prediction
prediction <- exp(log_prediction)
prediction
# Interpretation: From the output, the fitted gain at a snow density of 0.508 (g/cm'^3*') is around 38.76. The prediction interval is (32.7544, 45.87231).

# Construct Confidence Interval for density = 0.001
new.data <- data.frame(density = 0.001)
log_prediction = predict(least.squares, new.data, interval="confidence")
log_prediction
prediction <- exp(log_prediction)
prediction
# Interpretation: From the output, the fitted gain at a snow density of 0.508 (g/cm'^3*') is around 400.4783. The confidence interval of (365.3647, 438.9665) signifies the range in which the true population parameter lies at a 95% level of confidence.

# Construct Prediction Interval for density = 0.001
log_prediction = predict(least.squares, new.data, interval="prediction")
log_prediction
prediction <- exp(log_prediction)
prediction
# Interpretation: From the output, the fitted gain at a snow density of 0.508 (g/cm'^3*') is around 400.4783. The prediction interval is (334.4508, 479.541).

```


## Scenario 3: Cross-Validation
To check how well your procedure works, omit the set of measurements corresponding to the block of density 0.508, apply your "estimation"/calibration procedure to the remaining data, and provide an interval estimate for the density of a block with an average reading of 38.6. Where does the actual density fall in the interval? Try the same test, for the set of measurements at the 0.001 density.
```{r}

```


## Additional Scenario: TODO Title
Use the additional dataset to construct a model fitting temperature with DOY, latitude, and other reasonable features. Try sketching the least squares line on a scatter plot. We aim to investigate the relationship between temperature and the DOY, and its Latitude.
```{r}
# Check the correlation
data <- read.csv('Full Resolution Data/64506420.csv', header=TRUE)
data <- data[,c('Hour','DOY','POS_DOY','Lat','Lon','Ts','BP')]

# Drop the extreme outlier case
#data <- data[which(data$Ts>-200),]
data_matrix <- as.matrix(data)

# Correlation Matrix
corr_matrix <- cor(data_matrix)
corr_matrix
```
```{r}
# least squares line
fit<-lm(formula = Ts ~ DOY + Lat, data = data)
summary(fit)

ggplot(data,aes(x=DOY, y=Ts)) + 
  geom_point(color='#2980B9', size = 4) + 
  geom_smooth(method=lm, color='#2C3E50')

ggplot(data,aes(x=Lat, y=Ts)) + 
  geom_point(color='#2980B9', size = 4) + 
  geom_smooth(method=lm, color='#2C3E50')

title.residuals1 <- 'Residuals of Least Squares Regression Line'
plot(fit$residuals, main=title.residuals1)
abline(0, 0, col='red')

num.bins <- 10
hist(fit$residuals, breaks=num.bins, main=title.residuals1, xlab='Temperature', col='red')

qqnorm(fit$residuals, main=paste('Normal Q-Q Plot with', title.residuals1), cex.main=1)
qqline(fit$residuals, col='red')
```

```{r}
# Attempt for Multiple Regressions
data$Midnights <- 0
data$Midnights[which(data$Hour<6)] <- 1
data$Mornings <- 0
data$Mornings[which(data$Hour>=6 & data$Hour<11)] <- 1 
data$Noon <- 0
data$Noon[which(data$Hour>=11 & data$Hour<15)] <- 1
data$Afternoon <- 0
data$Afternoon[which(data$Hour>=15 & data$Hour<20)] <- 1
data$Nights <- 0
data$Nights[which(data$Hour>=20)] <- 1
```
=======
---
title: 'CASE STUDY 4:'
output: html_notebook
---

This study allows us to revisit/renew

1. Regression modeling
2. Properties of Least Squares/Fitting "a line"
3. Multiple observation

Datasets for this study are

1. The main file: gauge.txt
2. Supplementary large-scale files: download the following folder Full Resolution Data.zip More information about the supplementary file can be found at http://iabp.apl.washington.edu/data.html as well as http://nsidc.org/data/G00791


## Question
The aim of this lab is to provide a simple procedure for converting gain into density when the gauge is in operation. Keep in mind that the experiment was conducted by varying density and measuring the response in gain, but when the gauge is ultimately in use, the snow-pack density is to be estimated from the measured gain.


## Setup
```{r}
df <- read.table('gauge-1wb1wa6-2gpel41.txt', header=TRUE)
df <- df[order(df$density), ]  # Sort from least to greatest density

m <- 9  # Number of distinct block densities
t <- 10  # Number of replicate measurements

#install.packages('L1pack')
#install.packages("quantreg")
library(L1pack)  # Used for least absolute deviations regression line
library(quantreg)  # Used for quantile regression line
```


## Scenario 1: Fitting
Use the data to fit the gain, or a transformation of gain, to density. Try sketching the least squares line on a scatter plot.

* Do the residuals indicate any problems with the fit?
* If the densities of the polyethylene blocks are not reported exactly, how might this affect the fit?
* What if the blocks of polyethylene were not measured in random order (location)?
```{r}
# Plot raw data
title <- 'Density vs. Gain'
x.axis <- expression('Density (g/cm'^3*')')
y.axis <- 'Gain'
x.range <- c(0, .7)
y.range <- c(2.5, 6)
plot(df, main=title, xlab=x.axis, ylab=y.axis, xlim=x.range)


# Take log transformation of response variable (gain)
y.log.axis = 'log(Gain)'
df.log = data.frame(df['density'], log(df['gain']))
plot(df.log, main=title, xlab=x.axis, ylab=y.log.axis, xlim=x.range, ylim=y.range)


# Average replicate measurements
df.log.avg = aggregate(list(gain=df.log$gain), by=list(density=df.log$density), FUN=mean)
plot(df.log.avg, main=title, xlab=x.axis, ylab=y.log.axis, xlim=x.range, ylim=y.range)


# Fit gain to density
cor(df.log.avg)

least.squares <- lm(gain~density, data=df.log.avg)
lad <- lad(gain~density, data=df.log.avg)
quant <- rq(gain~density, tau=.5, data=df.log.avg)

plot(df.log, main=title, xlab=x.axis, ylab=y.log.axis, xlim=x.range, ylim=y.range)
abline(least.squares, col='red')
abline(lad, col='blue')
abline(quant, col='green')
legend('topright', legend=c('Least Squares Regression Line', 'Least Absolute Deviations Regression Line', '50% Quantile Regression Line'), col=c('red', 'blue', 'green'), lty=1)

summary(least.squares)
summary(lad)
quant


# Check conditions for linear regression: linearity, normality of residuals, and constant variability
least.squares.residuals <- data.frame(df.log['density'], df.log['gain'] - rep(predict(least.squares), each=10))
lad.residuals <- data.frame(df.log['density'], df.log['gain'] - rep(predict(lad), each=10))
quant.residuals <- data.frame(df.log['density'], df.log['gain'] - rep(predict(quant), each=10))

title.residuals1 <- 'Residuals of Least Squares Regression Line'
title.residuals2 <- 'Residuals of Least Absolute Deviations Regression Line'
title.residuals3 <- 'Residuals of 50% Quantile Regression Line'
plot(least.squares.residuals$gain, main=title.residuals1, ylab=y.axis)
abline(0, 0, col='red')
plot(lad.residuals$gain, main=title.residuals2, ylab=y.axis)
abline(0, 0, col='blue')
plot(quant.residuals$gain, main=title.residuals3, ylab=y.axis)
abline(0, 0, col='green')

num.bins <- 12
hist(least.squares.residuals$gain, breaks=num.bins, main=title.residuals1, xlab=y.axis, col='red')
hist(lad.residuals$gain, breaks=num.bins, main=title.residuals2, xlab=y.axis, col='blue')
hist(quant.residuals$gain, breaks=num.bins, main=title.residuals3, xlab=y.axis, col='green')

qqnorm(least.squares.residuals$gain, main=paste('Normal Q-Q Plot with', title.residuals1), cex.main=1)
qqline(least.squares.residuals$gain, col='red')
qqnorm(lad.residuals$gain, main=paste('Normal Q-Q Plot with', title.residuals2), cex.main=1)
qqline(lad.residuals$gain, col='blue')
qqnorm(quant.residuals$gain, main=paste('Normal Q-Q Plot with', title.residuals3), cex.main=1)
qqline(quant.residuals$gain, col='green')
```


## Scenario 2: Predicting
Ultimately we are interested in answering questions such as: Given a gain reading of 38.6, what is the density of the snow-pack? Given a gain reading of 426.7, what is the density of the snow-pack? These two numeric values, 38.6 and 426.7, were chosen because they are the average gains for the 0.508 and 0.001 densities, respectively.

* Develop a procedure for adding bands around your least squares line that can be used to make interval estimates for the snow-pack density from gain measurements. Keep in mind how the data were collected: several measurements of gain were taken for polyenythylene blocks of known density.
```{r fig.asp=2, fig.width=5}
# Predictions
PredictLogGain <- function(density)
  predict(least.squares, data.frame(density=density))  # Predict log(gain) using density

PredictDensityLeastSquares <- function(gain) {
  intercept <- coef(least.squares)[[1]]
  slope <- coef(least.squares)[[2]]
  (log(gain) - intercept) / slope  # Predict density using gain
}

PredictDensityLad <- function(gain) {
  intercept <- coef(lad)[[1]]
  slope <- coef(lad)[[2]]
  (log(gain) - intercept) / slope  # Predict density using gain
}

PredictDensityQuant <- function(gain) {
  intercept <- coef(quant)[[1]]
  slope <- coef(quant)[[2]]
  (log(gain) - intercept) / slope  # Predict density using gain
}


# 95% prediction and confidence intervals of log(gain) using density
t <- qt(.975, df=m-2)
mean.density <- mean(df.log.avg$density)
summation <- sum((df.log.avg$density - mean.density) ^ 2)

s2 <- aggregate(list(variance=least.squares.residuals$gain), by=list(density=least.squares.residuals$density), FUN=var)
s.pooled <- sqrt(mean(s2$variance))

center.expr <- quote(center <- PredictLogGain(density))
ci.width.expr <- quote(width <- t * s.pooled * sqrt(1/m + (density-mean.density)^2 / summation))
pi.width.expr <- quote(width <- t * s.pooled * sqrt(1 + 1/m + (density-mean.density)^2 / summation))

LogGainCiLower <- function(density) {
  eval(center.expr)
  eval(ci.width.expr)
  center - width
}

LogGainCiUpper <- function(density) {
  eval(center.expr)
  eval(ci.width.expr)
  center + width
}

LogGainPiLower <- function(density) {
  eval(center.expr)
  eval(pi.width.expr)
  center - width
}

LogGainPiUpper <- function(density) {
  eval(center.expr)
  eval(pi.width.expr)
  center + width
}


# Add bands around least squares line
plot(df.log, main=title, xlab=x.axis, ylab=y.log.axis, xlim=x.range, ylim=y.range)
abline(least.squares, col='red')

ci.col <- 'purple'
pi.col <- 'blue'
symbol <- '-'
size <- 1.5
line.type <- 3
line.width <- 0.7

confidence.intervals <- data.frame(density=df.log.avg$density, lower=LogGainCiLower(df.log.avg$density), upper=LogGainCiUpper(df.log.avg$density))
points(x=confidence.intervals$density, y=confidence.intervals$lower, col=ci.col, pch=symbol, cex=size)
points(x=confidence.intervals$density, y=confidence.intervals$upper, col=ci.col, pch=symbol, cex=size)
lines(x=confidence.intervals$density, y=confidence.intervals$lower, col=ci.col, lty=line.type, lwd=line.width)
lines(x=confidence.intervals$density, y=confidence.intervals$upper, col=ci.col, lty=line.type, lwd=line.width)

prediction.intervals <- data.frame(density=df.log.avg$density, lower=LogGainPiLower(df.log.avg$density), upper=LogGainPiUpper(df.log.avg$density))
points(x=prediction.intervals$density, y=prediction.intervals$lower, col=pi.col, pch=symbol, cex=size)
points(x=prediction.intervals$density, y=prediction.intervals$upper, col=pi.col, pch=symbol, cex=size)
lines(x=prediction.intervals$density, y=prediction.intervals$lower, col=pi.col, lty=line.type, lwd=line.width)
lines(x=prediction.intervals$density, y=prediction.intervals$upper, col=pi.col, lty=line.type, lwd=line.width)

legend('topright', legend=c('Least Squares Regression Line', '95% Confidence Interval Bands', '95% Prediction Interval Bands'), col=c('red', ci.col, pi.col), lty=1)


# 95% prediction and confidence intervals of density using gain
end.points <- c(-1, 3)  # Interval to search the root in

DensityCi <- function(gain) {
  lower <- uniroot(function(density) log(gain) - LogGainCiLower(density), interval=end.points)[[1]]
  upper <- uniroot(function(density) log(gain) - LogGainCiUpper(density), interval=end.points)[[1]]
  c(lower, upper)
}

DensityPi <- function(gain) {
  lower <- uniroot(function(density) log(gain) - LogGainPiLower(density), end.points)[[1]]
  upper <- uniroot(function(density) log(gain) - LogGainPiUpper(density), end.points)[[1]]
  c(lower, upper)
}


# Point and interval estimates for example gain readings
PredictDensityLeastSquares(38.6)  # 38.6 is the average gain for 0.508 density
PredictDensityLad(38.6)
PredictDensityQuant(38.6)
DensityCi(38.6)
DensityPi(38.6)

PredictDensityLeastSquares(426.7)  # 426.7 is the average gain for 0.001 density
PredictDensityLad(426.7)
PredictDensityQuant(38.6)
DensityCi(426.7)
DensityPi(426.7)
```


## Scenario 3: Cross-Validation
To check how well your procedure works, omit the set of measurements corresponding to the block of density 0.508, apply your "estimation"/calibration procedure to the remaining data, and provide an interval estimate for the density of a block with an average reading of 38.6. Where does the actual density fall in the interval? Try the same test, for the set of measurements at the 0.001 density.
```{r fig.asp=2, fig.width=5}
for (omitted in c(0.508, 0.001)) {
  # Omit measurements corresponding to the specified density
  df.log.omitted = df.log[which(df.log['density'] != omitted), ]
  df.log.avg.omitted <- df.log.avg[which(df.log.avg['density'] != omitted), ]
  
  
  # Redo calculations using modified dataset
  least.squares <- lm(gain~density, data=df.log.avg.omitted)
  
  mean.density <- mean(df.log.avg.omitted$density)
  summation <- sum((df.log.avg.omitted$density - mean.density) ^ 2)
  
  s2 <- aggregate(list(variance=least.squares.residuals$gain), by=list(density=least.squares.residuals$density), FUN=var)
  s.pooled <- sqrt(mean(s2$variance))
  
  ci.width.expr <- quote(width <- t * s.pooled * sqrt(1/(m-1) + (density-mean.density)^2 / summation))
  pi.width.expr <- quote(width <- t * s.pooled * sqrt(1 + 1/(m-1) + (density-mean.density)^2 / summation))
  
  plot(df.log.omitted, main=title, xlab=x.axis, ylab=y.log.axis, xlim=x.range, ylim=y.range)
  abline(least.squares, col='red')
  
  ci.col <- 'purple'
  pi.col <- 'blue'
  symbol <- '-'
  size <- 1.5
  line.type <- 3
  line.width <- 0.7
  
  confidence.intervals <- data.frame(density=df.log.avg.omitted$density, lower=LogGainCiLower(df.log.avg.omitted$density), upper=LogGainCiUpper(df.log.avg.omitted$density))
  points(x=confidence.intervals$density, y=confidence.intervals$lower, col=ci.col, pch=symbol, cex=size)
  points(x=confidence.intervals$density, y=confidence.intervals$upper, col=ci.col, pch=symbol, cex=size)
  lines(x=confidence.intervals$density, y=confidence.intervals$lower, col=ci.col, lty=line.type, lwd=line.width)
  lines(x=confidence.intervals$density, y=confidence.intervals$upper, col=ci.col, lty=line.type, lwd=line.width)
  
  prediction.intervals <- data.frame(density=df.log.avg.omitted$density, lower=LogGainPiLower(df.log.avg.omitted$density), upper=LogGainPiUpper(df.log.avg.omitted$density))
  points(x=prediction.intervals$density, y=prediction.intervals$lower, col=pi.col, pch=symbol, cex=size)
  points(x=prediction.intervals$density, y=prediction.intervals$upper, col=pi.col, pch=symbol, cex=size)
  lines(x=prediction.intervals$density, y=prediction.intervals$lower, col=pi.col, lty=line.type, lwd=line.width)
  lines(x=prediction.intervals$density, y=prediction.intervals$upper, col=pi.col, lty=line.type, lwd=line.width)
  
  legend('topright', legend=c('Least Squares Regression Line', '95% Confidence Interval Bands', '95% Prediction Interval Bands'), col=c('red', ci.col, pi.col), lty=1)
  
  print(PredictDensityLeastSquares(38.6))  # 38.6 is the average gain for 0.508 density
  print(DensityCi(38.6))
  print(DensityPi(38.6))
  
  print(PredictDensityLeastSquares(426.7))  # 426.7 is the average gain for 0.001 density
  print(DensityCi(426.7))
  print(DensityPi(426.7))
}
```


## Additional Scenario: Temperature, DOY, and Latitude.
Use the additional dataset to construct a model fitting temperature with DOY, latitude, and other reasonable features. Try sketching the least squares line on a scatter plot. We aim to investigate the relationship between temperature and the DOY, and its latitude.
```{r}
# Check the correlation
data <- read.csv('Full Resolution Data/64506420.csv', header=TRUE)
data <- data[,c('Hour','DOY','POS_DOY','Lat','Lon','Ts','BP')]

# Drop the extreme outlier case
#data <- data[which(data$Ts>-200),]
data_matrix <- as.matrix(data)

# Correlation Matrix
corr_matrix <- cor(data_matrix)
corr_matrix
```

```{r}
# least squares line
fit<-lm(formula = Ts ~ DOY + Lat, data = data)
summary(fit)

ggplot(data,aes(x=DOY, y=Ts)) + 
  geom_point(color='#2980B9', size = 4) + 
  geom_smooth(method=lm, color='#2C3E50')

ggplot(data,aes(x=Lat, y=Ts)) + 
  geom_point(color='#2980B9', size = 4) + 
  geom_smooth(method=lm, color='#2C3E50')

title.residuals1 <- 'Residuals of Least Squares Regression Line'
plot(fit$residuals, main=title.residuals1)
abline(0, 0, col='red')

num.bins <- 10
hist(fit$residuals, breaks=num.bins, main=title.residuals1, xlab='Temperature', col='red')

qqnorm(fit$residuals, main=paste('Normal Q-Q Plot with', title.residuals1), cex.main=1)
qqline(fit$residuals, col='red')
```

```{r}
# Attempt for Multiple Regressions
data$Midnights <- 0
data$Midnights[which(data$Hour<6)] <- 1
data$Mornings <- 0
data$Mornings[which(data$Hour>=6 & data$Hour<11)] <- 1 
data$Noon <- 0
data$Noon[which(data$Hour>=11 & data$Hour<15)] <- 1
data$Afternoon <- 0
data$Afternoon[which(data$Hour>=15 & data$Hour<20)] <- 1
data$Nights <- 0
data$Nights[which(data$Hour>=20)] <- 1
```
>>>>>>> 79bd799c1946ef878aa178d035150404c135a16c
>>>>>>> 8826eee1488bdd27557439752deda8eb4e450389
>>>>>>> d4f5f9c0a005e74c03c52f2335922402dbf39c0f